Code

Beginners and expert citizen scientists prefer similar species on iNaturalist, but experts contribute in average almost hundred times more data

Authors

Rodrigo Montiel

Manuele Bazzichetto

Florencia Grattarola

Published

April 19, 2025

We aim to identify users’ recording preferences according to their level of experience on the iNaturalist platform in Uruguay in order to understand the possible biases associated with the data.

Workflow

  1. Data download
  2. Users’ ranking
  3. Species traits
  4. Statistical analyses
library(patchwork)
library(knitr)
library(effects)
library(car)
library(stargazer)
library(tmap)
tmap_mode('plot')
library(sf)
sf::sf_use_s2(FALSE)
library(httr)
library(jsonlite)
library(lubridate)
library(tidyverse)

Data download

Code
# Load the iNaturalist observations export; a large `guess_max` lets readr
# inspect many rows before settling on the column types.
observations <- read_csv(
  file = 'data/NatUY_observations_03-05.csv',
  guess_max = 140000
)

Users’ ranking

First we need to detect those users that are not from Uruguay and remove the data generated by them from our dataset. To do this we created a function using the iNaturalist API.
The function getObserversNumObservations() takes a list of users’ ids (user_login_list) and a location (place_id), and retrieves: the user_id, user_login, and user_name, the number of observations and species of the user on iNat and on the location (in our case, Uruguay, place_id=7259), and the date when the user created their account (user_created_at).

Code
#' Retrieve per-user observation and species counts from the iNaturalist API.
#'
#' For every login in `user_login_list`, the `/v1/observations/observers`
#' endpoint is queried with `user_login` and `place_id` as filters. Progress is
#' reported on the console with `cat()`.
#'
#' @param user_login_list Character vector of iNaturalist user logins.
#' @param place_id iNaturalist place identifier used to restrict the query
#'   (default 7259, i.e. Uruguay according to the accompanying text).
#' @return A tibble with the columns `user_id`, `observations_iNat`,
#'   `observations_NatUY`, `species_iNat`, `species_NatUY`, `user_login`,
#'   `user_created_at` (POSIXct) and `user_name`. Logins for which the API
#'   returned an error or no result get a row with `NA` in every column
#'   except `user_login`.
getObserversNumObservations <- function(user_login_list, 
                                        place_id=7259){
  
  # Zero-row template: fixes column names and types of the result, also when
  # `user_login_list` is empty.
  empty_observers <- tibble(user_id = numeric(), 
                            observations_iNat = numeric(), 
                            observations_NatUY = numeric(),
                            species_iNat = numeric(), 
                            species_NatUY = numeric(), 
                            user_login = character(),
                            user_created_at = lubridate::ymd_hms(), 
                            user_name = character())
  
  # Typed placeholder row for logins without a usable API answer
  not_found_row <- function(login) {
    tibble(user_id = NA_real_, 
           observations_iNat = NA_real_, 
           observations_NatUY = NA_real_,
           species_iNat = NA_real_, 
           species_NatUY = NA_real_, 
           user_login = login,
           user_created_at = lubridate::NA_POSIXct_, 
           user_name = NA_character_)
  }
  
  # Collect one tibble per login and bind once at the end, instead of growing
  # the result with rbind() on every iteration.
  per_user <- vector('list', length(user_login_list))
  
  for (i in seq_along(user_login_list)) {
    user_login <- user_login_list[[i]]
    
    if (i %% 10 == 0) { 
      Sys.sleep(10) # The API needs a delay because otherwise it gives an error. 
      # Before every 10th request, the code stops for 10 seconds
    }
    
    # Passing the filters through `query` lets httr URL-encode the values
    get_json_call <- GET(url = 'https://api.inaturalist.org/v1/observations/observers',
                         query = list(user_login = user_login,
                                      place_id = place_id)) %>%
      content(as = 'text', encoding = 'UTF-8') %>%
      fromJSON(flatten = TRUE)
    
    results <- get_json_call$results
    # An empty `results` array is parsed as an empty list, not a data frame
    found <- !'error' %in% names(get_json_call) &&
      is.data.frame(results) && nrow(results) > 0
    
    if (found) {
      per_user[[i]] <- 
        tibble(user_id = results$user_id,
               observations_iNat = results$user.observations_count,
               observations_NatUY = results$observation_count,
               species_iNat = results$user.species_count,
               species_NatUY = results$species_count,
               user_login = results$user.login,
               # The API delivers the timestamp as text; parse it so the
               # column type matches the template
               user_created_at = lubridate::ymd_hms(results$user.created_at),
               # A missing name arrives as a logical NA
               user_name = as.character(results$user.name))
      cat(i, 'user:', user_login, ',',
          per_user[[i]]$observations_iNat, 'observations on iNat', '\n')
    } else {
      per_user[[i]] <- not_found_row(user_login)
      cat('user:', user_login, '--> NOT FOUND', '\n')
    }
  }
  
  bind_rows(empty_observers, per_user)
}

# Unique user logins present in the downloaded observations
users_dataset <- distinct(observations, user_login)

# One API request per user (slow: the function pauses between batches)
observers_num_observations <- getObserversNumObservations(
  user_login_list = users_dataset$user_login
)

# write_csv(observers_num_observations,
#           'data/observers_num_observations.csv')

Next, we discard visitors by identifying as Uruguayan those users with more than 40% of their iNaturalist observations recorded in Uruguay.

Code
# Users with more than 40% of their iNaturalist observations made in Uruguay
# are treated as Uruguayan; everyone else is considered a visitor.
uruguayans <- observers_num_observations %>%
  mutate(
    proportion_natuy_inat = round(observations_NatUY * 100 / observations_iNat, 3),
    uruguayan = if_else(proportion_natuy_inat > 40, 'yes', 'no')
  ) %>%
  filter(uruguayan == 'yes')

# Keep only the observations uploaded by Uruguayan users
observations_uy <- observations %>%
  filter(user_login %in% uruguayans$user_login)

After discarding foreign users, we calculated the number of records uploaded (observations), the time active on the platform (taking the dates of the first and last records uploaded, activity_time) and the number of records over time (the total number of records uploaded divided by the active time, observations_by_time).

With these variables we first categorised users as expert, intermediate or beginner using the following criteria:

  • Expert: Has 1,000 records or more AND has been active on the platform for at least one year (365 days) AND has a records/time ratio greater or equal to 0.6.
  • Intermediate: Is not an expert AND has 50 records or more AND has been active on the platform for more than 3 months (90 days) AND has a records/time ratio greater than 0.2.
  • Beginner: Every remaining user, i.e. anyone who does not meet the intermediate criteria (fewer than 50 records, OR 3 months or less of activity, OR a records/time ratio of 0.2 or lower).

Finally, we ranked the users according to their level of experience in the platform by calculating an index, such that:

Index = \text{category\_score} \cdot \left( w_1 \cdot \text{observations}_{\text{norm}} + w_2 \cdot \text{activity\_time}_{\text{norm}} + w_3 \cdot \text{observations\_by\_time}_{\text{norm}} \right)

We gave the number of observations a larger weight than the other two variables (w_1 = 2/3, w_2 = w_3 = 1/6).

We used the category_score to account for the categories we had previously established, thus, the outcome is a ranking that has experts, intermediates, and beginners sorted within each category.

Code
#' Min-max rescale a numeric vector to the [0, 1] interval.
#'
#' @param x A numeric vector.
#' @return A numeric vector of the same length as `x`. A constant vector is
#'   mapped to zeros; if `x` contains `NA`, every element of the result is
#'   `NA` (the range is computed without removing missing values).
normalise <- function(x) {
  if (length(x) == 0) {
    return(numeric(0))
  }
  value_range <- range(x)
  span <- value_range[2] - value_range[1]
  # A constant vector has zero span: return zeros rather than NaN (0 / 0)
  if (isTRUE(span == 0)) {
    return(rep(0, length(x)))
  }
  (x - value_range[1]) / span
}

# Per-user activity summary, experience category and ranking index
users_dataset <- observations_uy %>%
  group_by(user_login) %>%
  summarise(
    first_record = min(created_at),
    last_record = max(created_at),
    observations = n(),
    # + 1 keeps the denominator of the records/time ratio above zero
    activity_time = as.numeric(
      difftime(last_record, first_record, units = 'days')
    ) + 1,
    observations_by_time = observations / activity_time
  ) %>%
  # Users with very few records or a very short activity span are left out
  filter(observations >= 3 & activity_time > 3) %>%
  mutate(
    user_category = if_else(
      observations >= 1000 & activity_time >= 365 & observations_by_time >= 0.6,
      'expert',
      if_else(
        observations >= 50 & activity_time > 90 & observations_by_time > 0.2,
        'intermediate',
        'beginner'
      )
    ),
    # Multiplier that keeps the three categories apart in the final ranking
    category_score = case_when(
      user_category == 'expert' ~ 10,
      user_category == 'intermediate' ~ 1,
      user_category == 'beginner' ~ 0.01
    ),
    # Min-max normalised versions of the three variables
    observations_norm = normalise(observations),
    activity_time_norm = normalise(activity_time),
    observations_by_time_norm = normalise(observations_by_time),
    # Weights of the three variables
    w1 = 2/3,  # observations
    w2 = 1/6,  # activity_time
    w3 = 1/6,  # observations_by_time
    # Weighted sum scaled by the category multiplier
    index = category_score * (
      w1 * observations_norm +
        w2 * activity_time_norm +
        w3 * observations_by_time_norm
    )
  ) %>%
  # Highest index first; the position in that order is the ranking
  arrange(desc(index)) %>%
  mutate(ranking = row_number())

# write_csv(users_dataset, 'data/users_dataset.csv')
Code
# Table: the five best-ranked users within each category
users_dataset %>%
  group_by(category_score) %>%
  slice_head(n = 5) %>%
  ungroup() %>%
  select(
    Ranking = ranking,
    Category = user_category,
    User = user_login,
    `N of observations` = observations,
    `Activity (in days)` = activity_time,
    `N of observations per day` = observations_by_time
  ) %>%
  arrange(Ranking) %>%
  kableExtra::kbl(
    format.args = list(decimal.mark = '.', big.mark = ","),
    digits = 2,
    booktabs = TRUE
  ) %>%
  kableExtra::kable_styling(latex_options = c('striped', 'hold_position'))
Top 5 users per expertise category
Ranking Category User N of observations Activity (in days) N of observations per day
1 expert santiagomailhos 7,106 1,909.61 3.72
2 expert luisvescia 5,030 858.86 5.86
3 expert ornitoloca 3,554 1,342.43 2.65
4 expert enriquecenoz 3,096 579.77 5.34
5 expert msilvera 2,484 1,673.52 1.48
21 intermediate mirmeleon 1,121 2,146.10 0.52
22 intermediate rafatosi 1,143 1,995.26 0.57
23 intermediate gabriellaufer 853 1,642.21 0.52
24 intermediate leo_lagos 794 1,685.25 0.47
25 intermediate amailhos 638 1,907.84 0.33
113 beginner lyn_loveless 285 14.88 19.15
114 beginner intiporley 4 3,165.88 0.00
115 beginner smantaras 382 1,994.36 0.19
116 beginner gusper 224 2,177.10 0.10
117 beginner mariusvk 166 13.46 12.34
Code
# Table: number and share of users in each category
users_dataset %>%
  count(category_score, user_category) %>%
  arrange(category_score) %>%
  select(Category = user_category, `N of users` = n) %>%
  mutate(`%` = scales::label_percent()(`N of users` / sum(`N of users`))) %>%
  janitor::adorn_totals() %>%
  kableExtra::kbl(
    format.args = list(decimal.mark = '.', big.mark = ","),
    digits = 2,
    booktabs = TRUE
  ) %>%
  kableExtra::kable_styling(latex_options = c('striped', 'hold_position'))
Users per category
Category N of users %
beginner 824 88.0%
intermediate 92 9.8%
expert 20 2.1%
Total 936 -
Code
# Table: number and share of observations contributed by each category
observations_uy %>%
  left_join(
    select(users_dataset,
           user_login, observations, category_score, user_category, ranking)
  ) %>%
  # Observations of users without a category (not in users_dataset) are dropped
  filter(!is.na(user_category)) %>%
  count(category_score, user_category) %>%
  arrange(category_score) %>%
  select(Category = user_category, `N of observations` = n) %>%
  mutate(
    `%` = scales::label_percent()(`N of observations` / sum(`N of observations`))
  ) %>%
  janitor::adorn_totals() %>%
  kableExtra::kbl(
    format.args = list(decimal.mark = '.', big.mark = ","),
    digits = 2,
    booktabs = TRUE
  ) %>%
  kableExtra::kable_styling(latex_options = c('striped', 'hold_position'))
Number and % of observations per category
Category N of observations %
beginner 21,498 22.2%
intermediate 30,351 31.4%
expert 44,808 46.4%
Total 96,657 -
  • Average number of records for a beginner = 26.1
  • Average number of records for an intermediate = 329.9
  • Average number of records for an expert = 2240.4

Species traits

First, we exported the list of species for tetrapods and plants.

Code
# Taxonomy of research-grade observations identified to species level.
# str_count(taxon_species_name, '\\S+') == 2 keeps the records whose
# taxon_species_name consists of exactly two words.
species_list <- observations_uy %>%
  filter(quality_grade == 'research') %>%
  select(
    taxon_kingdom_name, taxon_phylum_name,
    taxon_class_name, taxon_order_name, taxon_family_name,
    taxon_genus_name, taxon_species_name
  ) %>%
  filter(str_count(taxon_species_name, '\\S+') == 2)

## tetrapods: number of records per species, by class
tetra <- species_list %>%
  filter(
    taxon_class_name %in% c('Aves', 'Amphibia', 'Mammalia', 'Reptilia')
  ) %>%
  group_by(taxon_class_name, taxon_species_name) %>%
  count()

# write_csv(tetra,'data/tetra_list.csv')

## plants: number of records per species, by family
dico <- species_list %>%
  filter(
    taxon_family_name %in% c('Fabaceae', 'Cactaceae', 'Asteraceae', 'Solanaceae')
  ) %>%
  group_by(taxon_family_name, taxon_species_name) %>%
  count()

# write_csv(dico,'data/dico_list.csv')

Then, using the list of species, we conducted a literature search, at the national level, to identify the following traits for each species: distribution area, conservation status and body size or growth form depending on whether it was an animal or a plant.

Code
# Species traits compiled from the literature, one file per group
tetrapods_traits <- read_csv(file = 'data/tetrapods_traits.csv')
plants_traits <- read_csv(file = 'data/plants_traits.csv')

We identified 33 tetrapod species and 52 plant species listed as non-native or domestic/cultivated (e.g. dog, horse, tobacco) in Uruguay, and discarded them from the analyses.

Code
# Table: plant species whose `remarks` contain 'non' (discarded from analyses)
plants_traits %>%
  filter(str_detect(remarks, 'non')) %>%
  distinct(taxon_family_name, taxon_species_name) %>%
  arrange(taxon_family_name, taxon_species_name) %>%
  kableExtra::kbl(booktabs = TRUE) %>%
  kableExtra::kable_styling(latex_options = c('striped', 'hold_position')) %>%
  kableExtra::scroll_box(height = '300px')
Plant species that were discarded
taxon_family_name taxon_species_name
Asteraceae Bidens aurea
Asteraceae Calendula officinalis
Asteraceae Carduus nutans
Asteraceae Carduus tenuiflorus
Asteraceae Cladanthus mixtus
Asteraceae Cosmos bipinnatus
Asteraceae Cosmos sulphureus
Asteraceae Cotula nigellifolia
Asteraceae Dimorphotheca ecklonis
Asteraceae Dittrichia viscosa
Asteraceae Euryops chrysanthemoides
Asteraceae Gazania rigens
Asteraceae Helenium amarum
Asteraceae Helianthus annuus
Asteraceae Hypochaeris glabra
Asteraceae Lactuca sativa
Asteraceae Pseudogynoxys chenopodioides
Asteraceae Senecio angulatus
Asteraceae Senecio tamoides
Asteraceae Tanacetum parthenium
Asteraceae Taraxacum erythrospermum
Asteraceae Tragopogon porrifolius
Asteraceae Youngia japonica
Asteraceae Zinnia elegans
Cactaceae Austrocylindropuntia subulata
Cactaceae Opuntia cochenillifera
Cactaceae Opuntia ficus-indica
Cactaceae Parodia fusca
Cactaceae Selenicereus undatus
Fabaceae Acacia elata
Fabaceae Acacia mearnsii
Fabaceae Acacia podalyriifolia
Fabaceae Acacia retinodes
Fabaceae Albizia julibrissin
Fabaceae Bauhinia variegata
Fabaceae Glycine max
Fabaceae Lathyrus latifolius
Fabaceae Lathyrus odoratus
Fabaceae Senna didymobotrya
Fabaceae Trifolium angustifolium
Fabaceae Trifolium fragiferum
Fabaceae Trifolium subterraneum
Fabaceae Vachellia karroo
Fabaceae Vicia villosa
Fabaceae Wisteria sinensis
Solanaceae Brugmansia arborea
Solanaceae Nicandra physalodes
Solanaceae Nicotiana tabacum
Solanaceae Physalis peruviana
Solanaceae Solanum lycopersicum
Solanaceae Solanum tuberosum
Solanaceae Streptosolen jamesonii
Code
# Table: tetrapod species whose `remarks` contain 'non' (discarded from analyses)
tetrapods_traits %>%
  filter(str_detect(remarks, 'non')) %>%
  distinct(taxon_class_name, taxon_species_name) %>%
  arrange(taxon_class_name, taxon_species_name) %>%
  kableExtra::kbl(booktabs = TRUE) %>%
  kableExtra::kable_styling(latex_options = c('striped', 'hold_position')) %>%
  kableExtra::scroll_box(height = '300px')
Tetrapod species that were discarded
taxon_class_name taxon_species_name
Amphibia Lithobates catesbeianus
Aves Agapornis personatus
Aves Amazona aestiva
Aves Anas platyrhynchos
Aves Anser anser
Aves Aratinga nenday
Aves Carduelis carduelis
Aves Catharus ustulatus
Aves Chloris chloris
Aves Columba livia
Aves Corvus splendens
Aves Fulmarus glacialis
Aves Gallus gallus
Aves Meleagris gallopavo
Aves Melopsittacus undulatus
Aves Motacilla alba
Aves Numida meleagris
Aves Nymphicus hollandicus
Aves Passer domesticus
Aves Pavo cristatus
Aves Sturnus vulgaris
Mammalia Axis axis
Mammalia Canis familiaris
Mammalia Dama dama
Mammalia Felis catus
Mammalia Lepus europaeus
Mammalia Mus musculus
Mammalia Oryctolagus cuniculus
Mammalia Rattus norvegicus
Mammalia Rattus rattus
Mammalia Sus scrofa
Reptilia Hemidactylus mabouia
Reptilia Tarentola mauritanica

Final datasets

Finally we combine the observations, with the users’ categorisation and the species traits, and create two tables (for tetrapods and plants), with observations as rows.

Code
# Observations enriched with their user's record count, category and ranking;
# observations of users without a category are dropped.
observations_dataset <- observations_uy %>%
  left_join(
    select(users_dataset, user_login, observations, user_category, ranking)
  ) %>%
  filter(!is.na(user_category))

# write_csv(observations_dataset, 'data/observations_dataset.csv')

And then, we classify the quantitative traits using qualitative values.

Trait Quantitative value Qualitative value Criteria for qualitative value
Distribution area Number of departments where the species is recorded at (from a total of 19) Narrow Present in at most 5 departments
Medium Present in 6 to 16 departments
Wide Present in 17 or more departments
Body size (tetrapods) Average body length of the species in centimetres Small Mammals < 50cm
Birds < 20cm
Reptiles < 50cm
Amphibians < 5cm
Medium Mammals >= 50cm and < 200cm
Birds >= 20cm and < 50 cm
Reptiles >= 50cm and < 100cm
Amphibians >= 5cm and < 10cm
Large Mammals >= 200cm
Birds >= 50cm
Reptiles >= 100cm
Amphibians >= 10cm
Growth form
(plants)
Herb
Vine
Liana
Subshrub
Shrub
Tree
Classification according to Darwinion (Zuloaga et al., 2019)
Conservation status Least concern (LC)
Near threatened (NT)
Vulnerable (VU)
Endangered (EN)
Critically endangered (CR)
Not evaluated (NE)
Data deficient (DD)
Classification according to the International Union for Conservation of Nature (IUCN, 2023)

Tetrapods’ traits

Code
# tetrapods
# Attach the traits to every observation. Trait rows whose `remarks` contain
# 'non' are excluded before joining, so the matching observations end up
# without a distribution area and are removed by the filter below.
tetra_data <- left_join(observations_dataset, 
                        tetrapods_traits %>% 
                          filter(!grepl('non', remarks))) %>% 
  filter(!is.na(distribution_area)) %>% 
  # distribution_area cannot be NA here, so no 'not assessed' class is needed
  mutate(distribution = case_when(distribution_area <= 5 ~ 'narrow',
                                  distribution_area > 5 & distribution_area <= 16 ~ 'medium',
                                  distribution_area > 16 ~ 'wide')) %>%
  # Size classes use class-specific thresholds; `size` stays NA when
  # body_size is missing or the class is not one of the four below
  mutate(size = case_when(taxon_class_name == 'Mammalia' & 
                            body_size < 50 ~ 'small',
                          taxon_class_name == 'Mammalia' & 
                            body_size >= 50 & body_size < 200 ~ 'medium',
                          taxon_class_name == 'Mammalia' & 
                            body_size >= 200 ~ 'large',
                          taxon_class_name == 'Amphibia' & 
                            body_size < 5 ~ 'small',
                          taxon_class_name == 'Amphibia' & 
                            body_size >= 5 & body_size < 10 ~ 'medium',
                          taxon_class_name == 'Amphibia' & 
                            body_size >= 10 ~ 'large',
                          taxon_class_name == 'Reptilia' & 
                            body_size < 50 ~ 'small',
                          taxon_class_name == 'Reptilia' & 
                            body_size >= 50 & body_size < 100 ~ 'medium',
                          taxon_class_name == 'Reptilia' & 
                            body_size >= 100 ~ 'large',
                          taxon_class_name == 'Aves' & 
                            body_size < 20 ~ 'small',
                          taxon_class_name == 'Aves' & 
                            body_size >= 20 & body_size < 50 ~ 'medium',
                          taxon_class_name == 'Aves' & 
                            body_size >= 50 ~ 'large'))

# write_csv(tetra_data, "data/tetra_data.csv")

Plants’ traits

Code
# Attach the traits to every observation. Trait rows whose `remarks` contain
# 'non' are excluded before joining, so the matching observations end up
# without a distribution area and are removed by the filter below.
plants_data <- left_join(observations_dataset,
                        plants_traits %>% 
                         filter(!grepl('non', remarks))) %>%
  filter(!is.na(distribution_area)) %>% 
  # distribution_area cannot be NA here, so no 'not assessed' class is needed
  mutate(distribution = case_when(distribution_area <= 5 ~ 'narrow',
                                  distribution_area > 5 & distribution_area <= 16 ~ 'medium',
                                  distribution_area > 16 ~ 'wide')) 

# write_csv(plants_data, "data/dico_data.csv")

Scale variables

Now let’s transform the data, ordering the qualitative variables and scaling the quantitative variables. Then, we filter out users with less than 3 records.

Code
# Order in which the conservation status factor is coded in both tables
# Tetrapods: keep users with at least 3 records in this group, turn the
# qualitative variables into ordered-level factors and scale the numeric traits
tetrapods <- tetra_data %>%
  add_count(user_login, name = 'n_observations') %>%
  filter(n_observations >= 3) %>%
  mutate(
    user_category = factor(
      user_category,
      levels = c('expert', 'intermediate', 'beginner')
    ),
    status = factor(
      conservation_status,
      levels = c('CR', 'EN', 'VU', 'NT', 'DD', 'NE', 'LC')
    )
  ) %>%
  select(
    ranking,
    dist_class = distribution, size_class = size,
    dist = distribution_area, size = body_size,
    status, taxon = taxon_class_name,
    state = place_state_name,
    user_category, user_login, taxon_species_name,
    latitude, longitude
  ) %>%
  mutate(
    # scale() returns a one-column matrix; [, 1] extracts the plain vector
    size_scaled = scale(size)[, 1],
    dist_scaled = scale(dist)[, 1],
    log_dist = log(dist)
  )

# Plants: same steps, with growth form instead of body size
plants <- plants_data %>%
  add_count(user_login, name = 'n_observations') %>%
  filter(n_observations >= 3) %>%
  mutate(
    user_category = factor(
      user_category,
      levels = c('expert', 'intermediate', 'beginner')
    ),
    status = factor(
      conservation_status,
      levels = c('CR', 'EN', 'VU', 'NT', 'DD', 'NE', 'LC')
    ),
    growth = factor(
      growth_form,
      levels = c('herb', 'vine', 'liana', 'subshrub', 'shrub', 'tree')
    )
  ) %>%
  select(
    ranking, dist = distribution_area,
    dist_class = distribution, growth,
    status, taxon = taxon_family_name,
    state = place_state_name,
    user_category, user_login, taxon_species_name,
    latitude, longitude
  ) %>%
  mutate(
    dist_scaled = scale(dist)[, 1],
    log_dist = log(dist)
  )

Average of species-variable per user

And finish up with aggregating records per user and computing the mean and SD of the values of the two traits. For tetrapods distribution area and body size, and for plants distribution area.

Code
# One row per user: mean and SD of the traits of the species they recorded.
# ranking and user_category are constant within a user, so the first value
# is taken.
tetrapods_per_user <- tetrapods %>%
  group_by(user_login) %>%
  summarise(
    mean_dist = mean(dist),
    mean_size = mean(size),
    sd_size = sd(size),
    sd_dist = sd(dist),
    ranking = first(ranking),
    user_category = first(user_category)
  )

plants_per_user <- plants %>%
  group_by(user_login) %>%
  summarise(
    mean_dist = mean(dist),
    sd_dist = sd(dist),
    ranking = first(ranking),
    user_category = first(user_category)
  )

# ggplot(tetrapods_per_user, aes(x = mean_dist, y=ranking)) +
#     geom_point(aes(col=user_category), alpha = 0.7,) +
#     labs(x = 'mean(distribution per user)', col='') +
#   ggpubr::theme_pubclean() +
#   theme(legend.position = 'bottom')
# 
# ggplot(tetrapods_per_user, aes(x = sd_dist, y=ranking)) +
#     geom_point(aes(col=user_category), alpha = 0.7,) +
#     labs(x = 'SD(distribution per user)', col='') +
#   ggpubr::theme_pubclean() +
#   theme(legend.position = 'bottom')
# 
# ggplot(plants_per_user, aes(x = sd_dist, y=ranking)) +
#     geom_point(aes(col=user_category), alpha = 0.7,) +
#     labs(x = 'SD(distribution per user)', col='') +
#   ggpubr::theme_pubclean() +
#   theme(legend.position = 'bottom')
# 
# ggplot(plants_per_user, aes(x = mean_dist, y=ranking)) +
#     geom_point(aes(col=user_category), alpha = 0.7,) +
#     labs(x = 'mean(distribution per user)', col='') +
#   ggpubr::theme_pubclean() +
#   theme(legend.position = 'bottom')

# hist(tetrapods_per_user$mean_dist,
#      xlab = 'mean(distribution per user)',
#      main='tetrapods')
# hist(tetrapods_per_user$sd_dist, 
#      xlab = 'SD(distribution per user)', 
#      main='tetrapods')
# 
# hist(tetrapods_per_user$mean_size, 
#      xlab = 'mean(size per user)', 
#      main='tetrapods')
# hist(tetrapods_per_user$sd_size, 
#      xlab = 'SD(size per user)', 
#      main='tetrapods')
# 
# hist(plants_per_user$mean_dist, 
#      xlab = 'mean(distribution per user)', 
#      main='plants')
# hist(plants_per_user$sd_dist, 
#      xlab = 'SD(distribution per user)', 
#      main='plants')

Summary of the data

Code
# Table: users, observations and species analysed per group
tibble(
  Group = c('tetrapods', 'plants'),
  Users = c(
    n_distinct(tetrapods$user_login),
    n_distinct(plants$user_login)
  ),
  Observations = c(nrow(tetrapods), nrow(plants)),
  Species = c(
    n_distinct(tetrapods$taxon_species_name),
    n_distinct(plants$taxon_species_name)
  )
) %>%
  janitor::adorn_totals() %>%
  kableExtra::kbl(
    format.args = list(decimal.mark = '.', big.mark = ","),
    digits = 2,
    booktabs = TRUE
  ) %>%
  kableExtra::kable_styling(latex_options = c('striped', 'hold_position'))
Summary of the data analysed
Group Users Observations Species
tetrapods 358 22,918 602
plants 291 10,821 530
Total 649 33,739 1,132
Code
# Table: observations and species per tetrapod class
tetrapods %>%
  group_by(taxon) %>%
  summarise(
    observations = n(),
    species = n_distinct(taxon_species_name)
  ) %>%
  janitor::adorn_totals() %>%
  kableExtra::kbl(
    format.args = list(decimal.mark = '.', big.mark = ","),
    digits = 2,
    booktabs = TRUE
  ) %>%
  kableExtra::kable_styling(latex_options = c('striped', 'hold_position'))
Observations for tetrapods
taxon observations species
Amphibia 1,768 35
Aves 17,776 441
Mammalia 1,737 68
Reptilia 1,637 58
Total 22,918 602
Code
# Table: users with tetrapod records, by category (one row per user first)
tetrapods %>%
  distinct(user_login, .keep_all = TRUE) %>%
  count(user_category) %>%
  mutate(`%` = scales::label_percent()(n / sum(n))) %>%
  kableExtra::kbl(
    format.args = list(decimal.mark = '.', big.mark = ","),
    digits = 2,
    booktabs = TRUE
  ) %>%
  kableExtra::kable_styling(latex_options = c('striped', 'hold_position'))
Users for tetrapods
user_category n %
expert 20 6%
intermediate 80 22%
beginner 258 72%
Code
# Table: observations and species per plant family
plants %>%
  group_by(taxon) %>%
  summarise(
    observations = n(),
    species = n_distinct(taxon_species_name)
  ) %>%
  janitor::adorn_totals() %>%
  kableExtra::kbl(
    format.args = list(decimal.mark = '.', big.mark = ","),
    digits = 2,
    booktabs = TRUE
  ) %>%
  kableExtra::kable_styling(latex_options = c('striped', 'hold_position'))
Observations for plants
taxon observations species
Asteraceae 6,023 284
Cactaceae 1,240 44
Fabaceae 2,230 143
Solanaceae 1,328 59
Total 10,821 530
Code
# Table: users with plant records, by category (one row per user first)
plants %>%
  distinct(user_login, .keep_all = TRUE) %>%
  count(user_category) %>%
  mutate(`%` = scales::label_percent()(n / sum(n))) %>%
  kableExtra::kbl(
    format.args = list(decimal.mark = '.', big.mark = ","),
    digits = 2,
    booktabs = TRUE
  ) %>%
  kableExtra::kable_styling(latex_options = c('striped', 'hold_position'))
Users for plants
user_category n %
expert 18 6%
intermediate 72 25%
beginner 201 69%

Tetrapods

Code
# Table: tetrapod observations per distribution-area class
tetrapods %>%
  count(dist_class, name = 'observations') %>%
  mutate(
    `%` = scales::label_percent(accuracy = 1)(observations / sum(observations))
  ) %>%
  rename(`distribution area` = dist_class) %>%
  kableExtra::kbl(
    format.args = list(decimal.mark = '.', big.mark = ","),
    digits = 2,
    booktabs = TRUE
  ) %>%
  kableExtra::kable_styling(latex_options = c('striped', 'hold_position'))
Classification of species by distribution area
distribution area observations %
medium 2,970 13%
narrow 1,115 5%
wide 18,833 82%
Code
# Table: distribution-area classes by user category. summarise() leaves the
# data grouped by user_category, so the percentages add up within a category.
tetrapods %>%
  group_by(user_category, dist_class) %>%
  summarise(n = n()) %>%
  mutate(freq = scales::label_percent(accuracy = 1)(n / sum(n))) %>%
  pivot_wider(
    names_from = user_category,
    values_from = c(n, freq)
  ) %>%
  kableExtra::kbl(
    format.args = list(decimal.mark = '.', big.mark = ","),
    digits = 2,
    booktabs = TRUE
  ) %>%
  kableExtra::kable_styling(latex_options = c('striped', 'hold_position'))
Species’ distribution area by user category
dist_class n_expert n_intermediate n_beginner freq_expert freq_intermediate freq_beginner
medium 1,773 881 316 15% 11% 10%
narrow 646 333 136 6% 4% 4%
wide 9,197 6,909 2,727 79% 85% 86%
Code
# Table: tetrapod observations per body-size class
tetrapods %>%
  count(size_class, name = 'observations') %>%
  mutate(
    `%` = scales::label_percent(accuracy = 1)(observations / sum(observations))
  ) %>%
  rename(`body size` = size_class) %>%
  kableExtra::kbl(
    format.args = list(decimal.mark = '.', big.mark = ","),
    digits = 2,
    booktabs = TRUE
  ) %>%
  kableExtra::kable_styling(latex_options = c('striped', 'hold_position'))
Classification of species by size
body size observations %
large 4,449 19%
medium 9,802 43%
small 8,667 38%
Code
# Table: body-size classes by user category. summarise() leaves the data
# grouped by user_category, so the percentages add up within a category.
tetrapods %>%
  group_by(user_category, size_class) %>%
  summarise(n = n()) %>%
  mutate(freq = scales::label_percent(accuracy = 1)(n / sum(n))) %>%
  pivot_wider(
    names_from = user_category,
    values_from = c(n, freq)
  ) %>%
  kableExtra::kbl(
    format.args = list(decimal.mark = '.', big.mark = ","),
    digits = 2,
    booktabs = TRUE
  ) %>%
  kableExtra::kable_styling(latex_options = c('striped', 'hold_position'))
Species’ size by user category
size_class n_expert n_intermediate n_beginner freq_expert freq_intermediate freq_beginner
large 2,045 1,696 708 18% 21% 22%
medium 4,903 3,562 1,337 42% 44% 42%
small 4,668 2,865 1,134 40% 35% 36%
Code
# Table: tetrapod observations per conservation status
tetrapods %>% 
    group_by(status) %>% 
    summarise(n = n()) %>% 
    mutate(freq = scales::label_percent(accuracy=1)(n/sum(n))) %>% 
      # Column label corrected: it used to read 'conservation satus'
      rename(`conservation status` = status,
         `observations` = n,
         `%` = freq) %>% 
  kableExtra::kbl(format.args = list(decimal.mark = '.', 
                                     big.mark = ","),
                  digits=2,
                  booktabs = TRUE) %>% 
  kableExtra::kable_styling(latex_options = c('striped', 'hold_position'))
Classification of species by conservation status
conservation status observations %
CR 2 0%
EN 103 0%
VU 334 1%
NT 835 4%
DD 52 0%
NE 901 4%
LC 20,691 90%
Code
# Table: conservation status by user category. summarise() leaves the data
# grouped by user_category, so the percentages add up within a category.
tetrapods %>%
  group_by(user_category, status) %>%
  summarise(n = n()) %>%
  mutate(freq = scales::label_percent(accuracy = 1)(n / sum(n))) %>%
  pivot_wider(
    names_from = user_category,
    values_from = c(n, freq)
  ) %>%
  kableExtra::kbl(
    format.args = list(decimal.mark = '.', big.mark = ","),
    digits = 2,
    booktabs = TRUE
  ) %>%
  kableExtra::kable_styling(latex_options = c('striped', 'hold_position'))
Species’ conservation status by user category
status n_expert n_intermediate n_beginner freq_expert freq_intermediate freq_beginner
CR 1 NA 1 0% NA 0%
EN 70 29 4 1% 0% 0%
VU 178 104 52 2% 1% 2%
NT 476 269 90 4% 3% 3%
DD 16 26 10 0% 0% 0%
NE 319 394 188 3% 5% 6%
LC 10,556 7,301 2,834 91% 90% 89%

Plants

Code
##### PLANTS

# Number and percentage of plant records per distribution-area class.
plants %>%
  count(dist_class) %>%
  mutate(freq = scales::percent(n / sum(n), accuracy = 1)) %>%
  rename(`distribution area` = dist_class,
         `observations` = n,
         `%` = freq) %>%
  kableExtra::kbl(digits = 2,
                  booktabs = TRUE,
                  format.args = list(decimal.mark = '.',
                                     big.mark = ",")) %>%
  kableExtra::kable_styling(latex_options = c('striped', 'hold_position'))
Classification of species by distribution area
distribution area observations %
medium 6,324 58%
narrow 4,283 40%
wide 214 2%
Code
# Plant records cross-tabulated by user category and distribution-area class:
# counts (n_*) and within-category percentages (freq_*).
plants %>%
  group_by(user_category, dist_class) %>%
  tally() %>%
  # still grouped by user_category: percentages are computed per category
  mutate(freq = scales::percent(n / sum(n), accuracy = 1)) %>%
  pivot_wider(names_from = 'user_category',
              values_from = c('n', 'freq')) %>%
  kableExtra::kbl(digits = 2,
                  booktabs = TRUE,
                  format.args = list(decimal.mark = '.',
                                     big.mark = ",")) %>%
  kableExtra::kable_styling(latex_options = c('striped', 'hold_position'))
Species’ distribution area by user category
dist_class n_expert n_intermediate n_beginner freq_expert freq_intermediate freq_beginner
medium 2,886 2,130 1,308 58% 60% 57%
narrow 1,968 1,385 930 40% 39% 41%
wide 118 57 39 2% 2% 2%
Code
# Number and percentage of plant records per growth form.
plants %>%
  count(growth) %>%
  mutate(freq = scales::percent(n / sum(n), accuracy = 1)) %>%
  rename(`growth form` = growth,
         `observations` = n,
         `%` = freq) %>%
  kableExtra::kbl(digits = 2,
                  booktabs = TRUE,
                  format.args = list(decimal.mark = '.',
                                     big.mark = ",")) %>%
  kableExtra::kable_styling(latex_options = c('striped', 'hold_position'))
Classification of species by growth form
growth form observations %
herb 6,435 59%
vine 116 1%
liana 75 1%
subshrub 1,278 12%
shrub 2,173 20%
tree 744 7%
Code
# Plant records cross-tabulated by user category and growth form:
# counts (n_*) and within-category percentages (freq_*).
plants %>%
  group_by(user_category, growth) %>%
  tally() %>%
  # still grouped by user_category: percentages are computed per category
  mutate(freq = scales::percent(n / sum(n), accuracy = 1)) %>%
  pivot_wider(names_from = 'user_category',
              values_from = c('n', 'freq')) %>%
  kableExtra::kbl(digits = 2,
                  booktabs = TRUE,
                  format.args = list(decimal.mark = '.',
                                     big.mark = ",")) %>%
  kableExtra::kable_styling(latex_options = c('striped', 'hold_position'))
Species’ growth form by user category
growth n_expert n_intermediate n_beginner freq_expert freq_intermediate freq_beginner
herb 3,034 2,069 1,332 61% 58% 58%
vine 58 32 26 1% 1% 1%
liana 32 25 18 1% 1% 1%
subshrub 622 379 277 13% 11% 12%
shrub 955 757 461 19% 21% 20%
tree 271 310 163 5% 9% 7%
Code
# Number and percentage of plant records per conservation status.
plants %>%
  count(status) %>%
  mutate(freq = scales::percent(n / sum(n), accuracy = 1)) %>%
  rename(`conservation status` = status,
         `observations` = n,
         `%` = freq) %>%
  kableExtra::kbl(digits = 2,
                  booktabs = TRUE,
                  format.args = list(decimal.mark = '.',
                                     big.mark = ",")) %>%
  kableExtra::kable_styling(latex_options = c('striped', 'hold_position'))
Classification of species by conservation status
conservation status observations %
CR 4 0%
EN 23 0%
VU 259 2%
NT 12 0%
DD 4 0%
NE 8,633 80%
LC 1,886 17%
Code
# Plant records cross-tabulated by user category and conservation status:
# counts (n_*) and within-category percentages (freq_*).
plants %>%
  group_by(user_category, status) %>%
  tally() %>%
  # still grouped by user_category: percentages are computed per category
  mutate(freq = scales::percent(n / sum(n), accuracy = 1)) %>%
  pivot_wider(names_from = 'user_category',
              values_from = c('n', 'freq')) %>%
  kableExtra::kbl(digits = 2,
                  booktabs = TRUE,
                  format.args = list(decimal.mark = '.',
                                     big.mark = ",")) %>%
  kableExtra::kable_styling(latex_options = c('striped', 'hold_position'))
Species’ conservation status by user category
status n_expert n_intermediate n_beginner freq_expert freq_intermediate freq_beginner
EN 11 9 3 0% 0% 0%
VU 161 61 37 3% 2% 2%
NT 3 8 1 0% 0% 0%
DD 1 1 2 0% 0% 0%
NE 3,978 2,825 1,830 80% 79% 80%
LC 818 664 404 16% 19% 18%
CR NA 4 NA NA 0% NA

Plots

Per group and user category

Code
# Stack tetrapod and plant records into one sf point layer (EPSG:4326) and
# fix the taxon order used by the facets and the colour scale.
data_sf <- bind_rows(tetrapods, plants) %>%
  sf::st_as_sf(coords = c('longitude', 'latitude')) %>%
  sf::st_set_crs(4326) %>%
  mutate(taxon = factor(taxon,
                        levels = c('Amphibia', 'Aves',
                                   'Reptilia', 'Mammalia',
                                   'Asteraceae', 'Cactaceae',
                                   'Fabaceae', 'Solanaceae')))

# Base layers: Uruguay's departments plus neighbouring countries / continent.
uruguay_dptos <- st_transform(readRDS('data/Uruguay.rds'), 4326)

argentina <- rnaturalearth::ne_countries(country = 'argentina', scale = 50)
brazil <- rnaturalearth::ne_countries(country = 'brazil', scale = 50)
southamerica <- rnaturalearth::ne_countries(continent = 'south america',
                                            scale = 50)

# One panel per taxon; tetrapods and plants are drawn as separate layers.
ggplot() +
  geom_sf(data = uruguay_dptos, fill = 'white') +
  geom_sf(data = argentina, fill = 'grey85') +
  geom_sf(data = brazil, fill = 'grey90') +
  geom_sf(data = filter(data_sf,
                        taxon %in% c('Amphibia', 'Aves',
                                     'Mammalia', 'Reptilia')),
          aes(col = taxon), show.legend = FALSE) +
  geom_sf(data = filter(data_sf,
                        taxon %in% c('Asteraceae', 'Cactaceae',
                                     'Fabaceae', 'Solanaceae')),
          aes(col = taxon), show.legend = FALSE) +
  scale_color_brewer(palette = 'Set2') +
  coord_sf(xlim = c(-59, -52.5), ylim = c(-35.5, -29.5), expand = FALSE) +
  facet_wrap(~taxon, ncol = 4) +
  theme_bw()

Code
# Locator map: South America with Uruguay highlighted in red.
southamerica_map <- ggplot() +
  geom_sf(data = southamerica, fill = 'grey95', col = 'grey45') +
  geom_sf(data = st_union(uruguay_dptos), fill = 'red') +
  coord_sf(xlim = c(-85, -32), ylim = c(-60, 15), expand = FALSE) +
  theme_void()

# Tetrapod records coloured by class.
tetra_map <- ggplot() +
  geom_sf(data = uruguay_dptos, fill = 'white') +
  geom_sf(data = argentina, fill = 'grey85') +
  geom_sf(data = brazil, fill = 'grey90') +
  geom_sf(data = data_sf %>% filter(taxon %in% c('Amphibia',
                                                 'Aves',
                                                 'Mammalia',
                                                 'Reptilia')),
          aes(col = taxon)) +
  # Colours 6 to 9 of 'Paired'. The palette must be requested with n = 9:
  # with n = 8 the 9th element does not exist, `[6:9]` yields an NA and the
  # fourth taxon is drawn in the scale's NA colour instead of a palette colour.
  # The first three colours are the same as with n = 8.
  scale_color_manual(
    values = RColorBrewer::brewer.pal(n = 9, name = "Paired")[6:9]
  ) +
  coord_sf(xlim = c(-59, -52.5), ylim = c(-35.5, -29.5), expand = FALSE) +
  theme_bw() +
  labs(col = '', title = 'Tetrapods') +
  theme(legend.position = 'bottom')

# Plant records coloured by family.
plants_map <- ggplot() +
  geom_sf(data = uruguay_dptos, fill = 'white') +
  geom_sf(data = argentina, fill = 'grey85') +
  geom_sf(data = brazil, fill = 'grey90') +
  geom_sf(data = data_sf %>% filter(taxon %in% c('Asteraceae',
                                                 'Cactaceae',
                                                 'Fabaceae',
                                                 'Solanaceae')),
          aes(col = taxon)) +
  scale_color_brewer(palette = 'Paired') +
  coord_sf(xlim = c(-59, -52.5), ylim = c(-35.5, -29.5), expand = FALSE) +
  theme_bw() +
  labs(col = '', title = 'Plants') +
  theme(legend.position = 'bottom')

# patchwork: the three maps side by side
southamerica_map | tetra_map | plants_map

Code
# ggsave(file='figs/observations_maps.svg',
#        plot=(southamerica_map | tetra_map | plants_map),
#        width=12, height=5, dpi = 100)

Spatial coverage (grids)

Code
# create grid: cells of size 0.1 (CRS units) clipped to the country outline
uruguay_grid <- st_make_grid(st_union(uruguay_dptos), 0.1) %>%
  st_intersection(st_union(uruguay_dptos)) %>%
  st_sf(grid_id = seq_along(.), geometry = .) %>%
  st_make_valid() %>%
  st_cast()

# convert observations to an sf object
observations_dataset_sf <- observations_dataset %>%
  sf::st_as_sf(coords = c('longitude', 'latitude')) %>%
  sf::st_set_crs(4326) %>%
  mutate(user_category = factor(user_category,
                                levels = c('beginner',
                                           'intermediate',
                                           'expert')))

# Count the observations of one user category in every grid cell.
#   grid         sf polygons with a `grid_id` column
#   observations sf points with `user_category` and `taxon_species_name`
#   category     user category to keep (compared against `user_category`)
# Returns one row per `grid_id` with a column `n_observations`.
count_observations_per_cell <- function(grid, observations, category) {
  st_join(grid,
          observations %>% filter(user_category == !!category)) %>%
    group_by(grid_id) %>%
    # st_join() is a left join: a cell without matches still contributes one
    # row (with NA attributes), so n() alone would report 1. The cell is
    # scored 0 when it holds no non-missing species name.
    summarise(n_observations = ifelse(n_distinct(taxon_species_name,
                                                 na.rm = TRUE) != 0,
                                      n(), 0)) %>%
    st_cast()
}

# Percentage of grid cells that hold at least one observation.
percent_cells_covered <- function(grid_counts) {
  grid_counts %>%
    st_drop_geometry() %>%
    summarise(coverage = sum(n_observations > 0) * 100 / n()) %>%
    pull()
}

expert_grid <- count_observations_per_cell(uruguay_grid,
                                           observations_dataset_sf,
                                           'expert')

intermediate_grid <- count_observations_per_cell(uruguay_grid,
                                                 observations_dataset_sf,
                                                 'intermediate')

beginner_grid <- count_observations_per_cell(uruguay_grid,
                                             observations_dataset_sf,
                                             'beginner')


coverage <- tibble(experts = percent_cells_covered(expert_grid),
                   intermediates = percent_cells_covered(intermediate_grid),
                   beginners = percent_cells_covered(beginner_grid))

# Build the gridded map of observation counts for one user category.
#   grid_counts sf grid with an `n_observations` column
#   palette     palette name handed to tm_scale_intervals(values = )
#   title       map title, placed outside the frame (top right)
# Cells with zero observations are recoded to NA so they are drawn in grey
# and labelled '0' in the legend instead of entering the Jenks classes.
# (Coverage percentages are reported in a separate table rather than drawn
# on the maps.)
observation_grid_map <- function(grid_counts, palette, title) {
  tm_graticules(alpha = 0.3) +
    tm_shape(grid_counts %>%
               mutate(n_observations = ifelse(n_observations == 0,
                                              NA, n_observations))) +
    tm_polygons(fill = 'n_observations', fill_alpha = 0.9,
                col = 'grey90', col_alpha = 0.2,
                fill.scale = tm_scale_intervals(n = 6,
                                                style = 'jenks',
                                                values = palette,
                                                value.na = 'grey80',
                                                label.na = '0'),
                fill.legend = tm_legend(item.space = 0, item.na.space = 0,
                                        title = 'Number of observations',
                                        reverse = TRUE, frame = FALSE)) +
    tm_shape(uruguay_dptos) +
    tm_borders(col = 'grey60', fill_alpha = 0.4) +
    tm_layout(legend.outside = TRUE, frame.lwd = 0.2, frame.r = 0) +
    tm_title(title, position = tm_pos_out('right', 'top'))
}

experts_map <- observation_grid_map(expert_grid,
                                    'brewer.greens', 'Experts')

intermediates_map <- observation_grid_map(intermediate_grid,
                                          'brewer.reds', 'Intermediates')

beginners_map <- observation_grid_map(beginner_grid,
                                      'brewer.blues', 'Beginners')

experts_map

Code
intermediates_map

Code
beginners_map

Code
# tmap_save(tm = experts_map, width = 8, height = 6,
#           filename = 'figs/experts_grid.svg', dpi = 100)
# tmap_save(tm = intermediates_map, width = 8, height = 6,
#           filename = 'figs/intermediates_grid.svg', dpi = 100)
# tmap_save(tm = beginners_map, width = 8, height = 6,
#           filename = 'figs/beginners_grid.svg', dpi = 100)

# Format the three coverage values (stored on a 0-100 scale) as whole
# percentages and render them as a one-row table.
coverage %>%
  mutate(across(c(experts, intermediates, beginners),
                ~ scales::percent(.x / 100, accuracy = 1))) %>%
  kableExtra::kbl(digits = 2,
                  booktabs = TRUE,
                  format.args = list(decimal.mark = '.',
                                     big.mark = ",")) %>%
  kableExtra::kable_styling(latex_options = c('striped', 'hold_position'))
Percentage of geographic coverage by user category
experts intermediates beginners
42% 39% 41%

Traits

Code
##### TETRAPODS

# Each panel is a 100%-stacked bar per user category (beginner ->
# intermediate -> expert) showing the share of records in each trait class.

# distribution
tetra_plot_distribution <- tetrapods %>%
  mutate(dist_class = factor(dist_class,
                             levels = c('narrow', 'medium', 'wide')),
         user_category = factor(user_category,
                                levels = c('beginner',
                                           'intermediate',
                                           'expert'))) %>%
  count(user_category, dist_class) %>%
  ggplot(aes(x = '', y = n, fill = dist_class)) +
  geom_col(width = 0.5, position = 'fill', show.legend = TRUE) +
  labs(x = '', y = 'Proportion of records', fill = 'distribution') +
  facet_grid(~ user_category) +
  scale_fill_brewer(palette = 'OrRd', name = 'distribution area') +
  ggpubr::theme_pubclean() +
  theme(legend.position = 'bottom') +
  ggtitle('(a)')

# size
tetra_plot_size <- tetrapods %>%
  mutate(size_class = factor(size_class,
                             levels = c('small', 'medium', 'large')),
         user_category = factor(user_category,
                                levels = c('beginner',
                                           'intermediate',
                                           'expert'))) %>%
  count(user_category, size_class) %>%
  ggplot(aes(x = '', y = n, fill = size_class)) +
  geom_col(width = 0.5, position = 'fill', show.legend = TRUE) +
  labs(x = '', y = '', fill = 'size') +
  facet_grid(~ user_category) +
  scale_fill_brewer(palette = 'OrRd', name = 'body size') +
  ggpubr::theme_pubclean() +
  theme(legend.position = 'bottom') +
  ggtitle('(b)')


# conservation status
tetra_plot_status <- tetrapods %>%
  mutate(status = factor(status,
                         levels = c('NE', 'DD', 'CR', 'EN',
                                    'VU', 'NT', 'LC')),
         user_category = factor(user_category,
                                levels = c('beginner',
                                           'intermediate',
                                           'expert'))) %>%
  count(user_category, status) %>%
  ggplot(aes(x = '', y = n, fill = status)) +
  geom_col(width = 0.5, position = 'fill', show.legend = TRUE) +
  labs(x = '', y = '', fill = 'IUCNglobal') +
  facet_grid(~ user_category) +
  scale_fill_brewer(palette = 'OrRd', name = 'conservation status') +
  ggpubr::theme_pubclean() +
  theme(legend.position = 'bottom') +
  ggtitle('(c)')

# patchwork: panels (a) | (b) | (c)
tetra_plots <- tetra_plot_distribution | tetra_plot_size | tetra_plot_status

tetra_plots

Code
# ggsave(tetra_plots, dpi = 100, 
#        width = 18, height = 7, scale = 0.8,
#        file = 'figs/Figure_4.svg')

##### PLANTS

# Same layout as the tetrapod figure: one 100%-stacked bar per user category
# (beginner -> intermediate -> expert) for each plant trait.

# distribution
plants_plot_distribution <- plants %>%
  mutate(dist_class = factor(dist_class,
                             levels = c('narrow', 'medium', 'wide')),
         user_category = factor(user_category,
                                levels = c('beginner',
                                           'intermediate',
                                           'expert'))) %>%
  count(user_category, dist_class) %>%
  ggplot(aes(x = '', y = n, fill = dist_class)) +
  geom_col(width = 0.5, position = 'fill', show.legend = TRUE) +
  labs(x = '', y = 'Proportion of records', fill = 'distribution') +
  facet_grid(~ user_category) +
  scale_fill_brewer(palette = 'Greens', name = 'distribution area') +
  ggpubr::theme_pubclean() +
  theme(legend.position = 'bottom') +
  ggtitle('(a)')

# growth form
plants_plot_growth <- plants %>%
  mutate(growth = factor(growth,
                         levels = c('tree', 'shrub', 'vine',
                                    'herb', 'liana', 'subshrub')),
         user_category = factor(user_category,
                                levels = c('beginner',
                                           'intermediate',
                                           'expert'))) %>%
  count(user_category, growth) %>%
  ggplot(aes(x = "", y = n, fill = growth)) +
  geom_col(width = 0.5, position = "fill", show.legend = TRUE) +
  labs(x = "", y = "", fill = "growth form") +
  facet_grid(~ user_category) +
  scale_fill_brewer(palette = 'Greens', name = 'growth form') +
  ggpubr::theme_pubclean() +
  theme(legend.position = 'bottom') +
  ggtitle('(b)')


# conservation status
plants_plot_status <- plants %>%
  mutate(status = factor(status,
                         levels = c('NE', 'DD', 'CR', 'EN',
                                    'VU', 'NT', 'LC')),
         user_category = factor(user_category,
                                levels = c('beginner',
                                           'intermediate',
                                           'expert'))) %>%
  count(user_category, status) %>%
  ggplot(aes(x = '', y = n, fill = status)) +
  geom_col(width = 0.5, position = 'fill', show.legend = TRUE) +
  labs(x = '', y = '', fill = 'IUCNglobal') +
  facet_grid(~ user_category) +
  scale_fill_brewer(palette = 'Greens', name = 'conservation status') +
  ggpubr::theme_pubclean() +
  theme(legend.position = 'bottom') +
  ggtitle('(c)')


# patchwork: panels (a) | (b) | (c)
plants_plots <- plants_plot_distribution | plants_plot_growth | plants_plot_status

plants_plots

Code
# ggsave(plants_plots, dpi = 100, 
#        width = 18, height = 7, scale = 0.8,
#        file = 'figs/Figure_5.svg')
Code
##### TETRAPODS

# Supplementary figure: per-class distributions of the three tetrapod traits,
# one facet per taxon with free axes.

# size plot
supp_size_plot <- ggplot(data = tetrapods, aes(x = size, fill = taxon)) +
  geom_histogram(position = 'identity',
                 bins = 15, show.legend = FALSE) +
  facet_wrap(taxon ~ ., scales = 'free', ncol = 4) +
  scale_fill_brewer(palette = 'Reds', direction = -1) +
  ggpubr::theme_pubclean() +
  labs(x = 'size', y = 'number of records') +
  ggtitle('(a)')

# distribution plot
supp_distribution_plot <- ggplot(data = tetrapods,
                                 aes(x = dist, fill = taxon)) +
  geom_histogram(position = 'identity',
                 bins = 15, show.legend = FALSE) +
  facet_wrap(taxon ~ ., scales = 'free', ncol = 4) +
  scale_fill_brewer(palette = 'Reds', direction = -1) +
  ggpubr::theme_pubclean() +
  labs(x = 'distribution area', y = 'number of records') +
  ggtitle('(b)')

# conservation status plot
# `status` is categorical, so records are counted with geom_bar();
# geom_histogram(stat = 'count') draws the same bars but makes ggplot2 warn
# about ignored binning parameters.
supp_status_plot <- ggplot(data = tetrapods, aes(x = status, fill = taxon)) +
  geom_bar(show.legend = FALSE) +
  facet_wrap(taxon ~ ., scales = 'free', ncol = 4) +
  scale_fill_brewer(palette = 'Reds', direction = -1) +
  ggpubr::theme_pubclean() +
  labs(x = 'conservation status', y = 'number of records') +
  ggtitle('(c)')

# patchwork: panels stacked vertically
supp_tetra_plots <- supp_size_plot / supp_distribution_plot / supp_status_plot

supp_tetra_plots

Code
# ggsave(supp_tetra_plots, dpi = 100,
#        width = 10, height = 15, scale = 0.8,
#        file = 'figs/Figure_S2.svg')


##### PLANTS

# Supplementary figure: per-family distributions of the three plant traits,
# one facet per taxon with free axes.
# NOTE: `supp_distribution_plot` and `supp_status_plot` are re-assigned here
# and replace the tetrapod versions created earlier.

# growth plot
# `growth` is categorical, so records are counted with geom_bar();
# geom_histogram(stat = 'count') draws the same bars but makes ggplot2 warn
# about ignored binning parameters.
supp_growth_plot <- ggplot(data = plants, aes(x = growth, fill = taxon)) +
  geom_bar(show.legend = FALSE) +
  facet_wrap(taxon ~ ., scales = 'free', ncol = 4) +
  scale_fill_brewer(palette = 'Greens', direction = -1) +
  ggpubr::theme_pubclean() +
  labs(x = 'growth form', y = 'number of records') +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  ggtitle('(a)')

# distribution plot
supp_distribution_plot <- ggplot(data = plants, aes(x = dist, fill = taxon)) +
  geom_histogram(position = 'identity',
                 bins = 15, show.legend = FALSE) +
  facet_wrap(taxon ~ ., scales = 'free', ncol = 4) +
  scale_fill_brewer(palette = 'Greens', direction = -1) +
  ggpubr::theme_pubclean() +
  labs(x = 'distribution area', y = 'number of records') +
  ggtitle('(b)')

# conservation status plot (categorical, hence geom_bar())
supp_status_plot <- ggplot(data = plants, aes(x = status, fill = taxon)) +
  geom_bar(show.legend = FALSE) +
  facet_wrap(taxon ~ ., scales = 'free', ncol = 4) +
  scale_fill_brewer(palette = 'Greens', direction = -1) +
  ggpubr::theme_pubclean() +
  labs(x = 'conservation status', y = 'number of records') +
  ggtitle('(c)')


# patchwork: panels stacked vertically
supp_plants_plots <- supp_growth_plot / supp_distribution_plot / supp_status_plot

supp_plants_plots

Code
# ggsave(supp_plants_plots, dpi = 100,
#        width = 10, height = 15, scale = 0.8,
#        file = 'figs/Figure_S3.svg')

Statistical analyses

Tetrapods

Hypothesis 1

Code
# Record-level linear model for tetrapods: `ranking` regressed on the scaled
# distribution and size variables plus conservation status.
mod_tetra <- lm(
  ranking ~ dist_scaled + size_scaled + status,
  data = tetrapods
)

# summary(mod_tetra)

# Residual plots with curvature tests for each numeric predictor
car::residualPlots(mod_tetra)

            Test stat Pr(>|Test stat|)    
dist_scaled    3.0475        0.0023099 ** 
size_scaled   -3.3552        0.0007945 ***
status                                    
Tukey test    -0.1987        0.8425336    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Code
# Predictor-effect plot of `dist_scaled` in the tetrapod record-level model
predictorEffect(predictor = 'dist_scaled', mod = mod_tetra) %>% plot()

Code
# Predictor-effect plot of `size_scaled` in the tetrapod record-level model
predictorEffect(predictor = 'size_scaled', mod = mod_tetra) %>% plot()

Hypothesis 2

Code
# Mean of `dist` over tetrapod records, per user category
with(tetrapods, tapply(dist, INDEX = user_category, FUN = mean))
      expert intermediate     beginner 
    16.77023     17.38902     17.43221 
Code
# Standard deviation of `dist` over tetrapod records, per user category
with(tetrapods, tapply(dist, INDEX = user_category, FUN = sd))
      expert intermediate     beginner 
    4.766938     4.124072     4.101917 
Code
# Per-user models for tetrapods: `ranking` against the per-user mean
# (first model) and per-user SD (second model) of the trait values.
mod_tetra_user_mean <- lm(ranking ~ mean_size + mean_dist,
                          data = tetrapods_per_user)
# summary(mod_tetra_user_mean)

mod_tetra_user_sd <- lm(ranking ~ sd_dist + sd_size,
                        data = tetrapods_per_user)
# summary(mod_tetra_user_sd)

# Predictor effects (fit and confidence band) are taken from the fitted
# models above instead of from separately re-fitted copies, so the figures
# always show exactly the models reported in the summary table.
pred_mod_tetra_user_mean <- as.data.frame(
  predictorEffects(mod = mod_tetra_user_mean)
)

# Each plot: raw per-user points coloured by user category, with the model
# fit (dashed line) and its confidence band (grey ribbon) on top.
plot_pred_mean_dist <- ggplot(data = pred_mod_tetra_user_mean$mean_dist,
                              aes(x = mean_dist, y = fit)) +
  geom_point(data = tetrapods_per_user,
             aes(y = ranking, x = mean_dist, col = user_category),
             show.legend = FALSE) +
  geom_ribbon(aes(ymin = lower, ymax = upper),
              fill = 'grey40', alpha = 0.5) +
  geom_line(col = 'black', linewidth = 1, linetype = 'dashed') +
  labs(y = 'ranking', x = 'mean(distribution) per user', col = '') +
  # ylim(c(0,600))+
  ggpubr::theme_classic2()

plot_pred_mean_size <- ggplot(data = pred_mod_tetra_user_mean$mean_size,
                              aes(x = mean_size, y = fit)) +
  geom_point(data = tetrapods_per_user,
             aes(y = ranking, x = mean_size, col = user_category),
             show.legend = FALSE) +
  geom_ribbon(aes(ymin = lower, ymax = upper),
              fill = 'grey40', alpha = 0.5) +
  geom_line(col = 'black', linewidth = 1, linetype = 'dashed') +
  labs(y = 'ranking', x = 'mean(size) per user', col = '') +
  # ylim(c(0,600))+
  ggpubr::theme_classic2()

pred_mod_tetra_user_sd <- as.data.frame(
  predictorEffects(mod = mod_tetra_user_sd)
)

# The SD plots sit on the right of each pair: they carry the colour legend
# and leave the y-axis label blank.
plot_pred_sd_dist <- ggplot(data = pred_mod_tetra_user_sd$sd_dist,
                            aes(x = sd_dist, y = fit)) +
  geom_point(data = tetrapods_per_user,
             aes(y = ranking, x = sd_dist, col = user_category)) +
  geom_ribbon(aes(ymin = lower, ymax = upper),
              fill = 'grey40', alpha = 0.5) +
  geom_line(col = 'black', linewidth = 1, linetype = 'dashed') +
  labs(y = '', x = 'SD(distribution) per user', col = '') +
  # ylim(c(0,600))+
  ggpubr::theme_classic2()

plot_pred_sd_size <- ggplot(data = pred_mod_tetra_user_sd$sd_size,
                            aes(x = sd_size, y = fit)) +
  geom_point(data = tetrapods_per_user,
             aes(y = ranking, x = sd_size, col = user_category)) +
  geom_ribbon(aes(ymin = lower, ymax = upper),
              fill = 'grey40', alpha = 0.5) +
  geom_line(col = 'black', linewidth = 1, linetype = 'dashed') +
  labs(y = '', x = 'SD(size) per user', col = '') +
  # ylim(c(0,600))+
  ggpubr::theme_classic2()

plot_pred_mean_size | plot_pred_sd_size

Code
plot_pred_mean_dist | plot_pred_sd_dist

Models’ summary

Code
# HTML regression table for the three tetrapod models, reporting confidence
# intervals instead of standard errors, rounded to one decimal.
stargazer(mod_tetra, mod_tetra_user_mean, mod_tetra_user_sd,
          title = 'tetrapods',
          type = 'html',
          ci = TRUE,
          digits = 1)
tetrapods
Dependent variable:
ranking
(1) (2) (3)
dist_scaled 4.6***
(2.7, 6.4)
size_scaled 2.5***
(0.8, 4.2)
statusEN -49.2
(-229.6, 131.3)
statusVU -25.1
(-204.4, 154.2)
statusNT -38.1
(-217.1, 140.9)
statusDD -21.3
(-203.5, 161.0)
statusNE -10.4
(-189.5, 168.6)
statusLC -28.6
(-207.4, 150.2)
mean_size 0.4
(-0.3, 1.1)
mean_dist 17.0***
(5.6, 28.5)
sd_dist -24.9***
(-34.7, -15.0)
sd_size 0.2
(-0.3, 0.7)
Constant 95.6 -3.7 376.9***
(-83.2, 274.4) (-213.1, 205.8) (338.3, 415.5)
Observations 22,918 358 358
R2 0.003 0.02 0.1
Adjusted R2 0.003 0.02 0.1
Residual Std. Error 128.9 (df = 22909) 239.7 (df = 355) 234.2 (df = 355)
F Statistic 8.5*** (df = 8; 22909) 4.3** (df = 2; 355) 12.9*** (df = 2; 355)
Note: *p<0.1; **p<0.05; ***p<0.01

Plants

Hypothesis 1

Code
# Record-level linear model for plants: `ranking` regressed on the scaled
# distribution variable, growth form and conservation status.
mod_plants <- lm(
  ranking ~ dist_scaled + growth + status,
  data = plants
)
# summary(mod_plants)

# Residual plots with curvature tests for each numeric predictor
car::residualPlots(mod_plants)

            Test stat Pr(>|Test stat|)
dist_scaled    -0.882           0.3778
growth                                
status                                
Tukey test     -0.601           0.5478
Code
# Predictor-effect plot of `dist_scaled` in the plant record-level model
predictorEffect(predictor = 'dist_scaled', mod = mod_plants) %>% plot()

Hypothesis 2

Code
# Mean of `dist` over plant records, per user category
with(plants, tapply(dist, INDEX = user_category, FUN = mean))
      expert intermediate     beginner 
    7.464803     7.402016     7.282389 
Code
# Standard deviation of `dist` over plant records, per user category
with(plants, tapply(dist, INDEX = user_category, FUN = sd))
      expert intermediate     beginner 
    4.769118     4.534960     4.649368 
Code
# Per-user models for plants: `ranking` against the per-user mean and the
# per-user SD of the distribution variable.
mod_plants_user_mean <- lm(ranking ~ mean_dist,
                           data = plants_per_user)
# summary(mod_plants_user_mean)

mod_plants_user_sd <- lm(ranking ~ sd_dist,
                         data = plants_per_user)
# summary(mod_plants_user_sd)

# Predictor effects are taken from the fitted models above instead of from
# separately re-fitted copies, so the figure always shows exactly the models
# reported in the summary table.
pred_mod_plants_user_mean <- as.data.frame(
  predictorEffects(mod = mod_plants_user_mean)
)

# Left panel: carries the shared y-axis label and hides the legend.
# (The label was previously on the right-hand SD panel.)
plot_pred_mean_dist <- ggplot(data = pred_mod_plants_user_mean$mean_dist,
                              aes(x = mean_dist, y = fit)) +
  geom_point(data = plants_per_user,
             aes(y = ranking, x = mean_dist, col = user_category),
             show.legend = FALSE) +
  geom_ribbon(aes(ymin = lower, ymax = upper),
              fill = 'grey40', alpha = 0.5) +
  geom_line(col = 'black', linewidth = 1, linetype = 'dashed') +
  labs(y = 'ranking', x = 'mean(distribution) per user', col = '') +
  # ylim(c(0,600))+
  ggpubr::theme_classic2()


pred_mod_plants_user_sd <- as.data.frame(
  predictorEffects(mod = mod_plants_user_sd)
)

# Right panel: shows the colour legend; y-axis label left blank because it
# shares the `ranking` axis with the left panel.
plot_pred_sd_dist <- ggplot(data = pred_mod_plants_user_sd$sd_dist,
                            aes(x = sd_dist, y = fit)) +
  geom_point(data = plants_per_user,
             aes(y = ranking, x = sd_dist, col = user_category)) +
  geom_ribbon(aes(ymin = lower, ymax = upper),
              fill = 'grey40', alpha = 0.5) +
  geom_line(col = 'black', linewidth = 1, linetype = 'dashed') +
  labs(y = '', x = 'SD(distribution) per user', col = '') +
  # ylim(c(0,600))+
  ggpubr::theme_classic2()

plot_pred_mean_dist | plot_pred_sd_dist

Models’ summary

Code
# HTML regression table for the three plant models, reporting confidence
# intervals instead of standard errors, rounded to one decimal.
stargazer(mod_plants, mod_plants_user_mean, mod_plants_user_sd,
          title = 'plants',
          type = 'html',
          ci = TRUE,
          digits = 1)
plants
Dependent variable:
ranking
(1) (2) (3)
dist_scaled -2.0
(-4.6, 0.6)
growthvine -4.2
(-28.7, 20.3)
growthliana 3.5
(-27.0, 34.0)
growthsubshrub 4.2
(-3.8, 12.3)
growthshrub 1.9
(-5.0, 8.8)
growthtree 12.8**
(0.6, 25.0)
statusEN 29.4
(-111.9, 170.6)
statusVU 18.4
(-113.1, 149.8)
statusNT 3.1
(-147.4, 153.7)
statusDD 147.5
(-37.0, 332.0)
statusNE 45.2
(-85.2, 175.7)
statusLC 43.6
(-87.1, 174.2)
mean_dist -14.6**
(-28.0, -1.1)
sd_dist -23.9**
(-42.5, -5.4)
Constant 33.1 380.5*** 378.0***
(-97.3, 163.5) (280.1, 480.9) (294.6, 461.4)
Observations 10,821 291 291
R2 0.002 0.02 0.02
Adjusted R2 0.001 0.01 0.02
Residual Std. Error 133.0 (df = 10808) 217.0 (df = 289) 216.3 (df = 289)
F Statistic 2.1** (df = 12; 10808) 4.5** (df = 1; 289) 6.4** (df = 1; 289)
Note: *p<0.1; **p<0.05; ***p<0.01